{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- A: integer (nullable = true)\n",
      " |-- B: integer (nullable = true)\n",
      " |-- C: double (nullable = true)\n",
      " |-- D: integer (nullable = true)\n",
      " |-- Spoiled: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.appName('dog_food_tree').getOrCreate()\n",
    "df = spark.read.csv('dog_food.csv', inferSchema=True, header=True)\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+----+---+-------+\n",
      "|  A|  B|   C|  D|Spoiled|\n",
      "+---+---+----+---+-------+\n",
      "|  4|  2|12.0|  3|    1.0|\n",
      "|  5|  6|12.0|  7|    1.0|\n",
      "+---+---+----+---+-------+\n",
      "only showing top 2 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import VectorAssembler\n",
    "\n",
    "assembler = VectorAssembler(inputCols = ['A', 'B', 'C', 'D'], outputCol = 'features')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "output = assembler.transform(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+----+---+-------+------------------+\n",
      "|  A|  B|   C|  D|Spoiled|          features|\n",
      "+---+---+----+---+-------+------------------+\n",
      "|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|\n",
      "|  5|  6|12.0|  7|    1.0|[5.0,6.0,12.0,7.0]|\n",
      "|  6|  2|13.0|  6|    1.0|[6.0,2.0,13.0,6.0]|\n",
      "+---+---+----+---+-------+------------------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "output.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import RandomForestClassifier\n",
    "\n",
    "rf = RandomForestClassifier(labelCol = 'Spoiled', featuresCol = 'features')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df = output.select('features', 'Spoiled')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+-------+\n",
      "|          features|Spoiled|\n",
      "+------------------+-------+\n",
      "|[4.0,2.0,12.0,3.0]|    1.0|\n",
      "|[5.0,6.0,12.0,7.0]|    1.0|\n",
      "|[6.0,2.0,13.0,6.0]|    1.0|\n",
      "+------------------+-------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "final_df.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "rf_model = rf.fit(final_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df.take(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SparseVector(4, {0: 0.0212, 1: 0.0166, 2: 0.9373, 3: 0.025})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rf_model.featureImportances"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
